Ch.9.6.1 – Support Vector Classifier

# Simulate 2*nObs two-feature observations in two classes; class +1 is
# shifted up by one unit so the classes overlap.
set.seed(1)
nObs <- 10
x <- matrix(rnorm(2 * nObs * 2), ncol = 2)
y <- c(rep(-1, nObs), rep(1, nObs))
x[y == 1, ] <- x[y == 1, ] + 1
# The two classes overlap, so the data are not linearly separable:
plot(x, col = 3 - y, pch = 3 - y)

# svm() performs classification only when the outcome is a factor:
dat <- data.frame(x = x, y = as.factor(y))

# Support vector classifier: linear kernel, cost = 10, no feature scaling:
svmfit <- svm(y ~ ., data = dat, kernel = "linear", cost = 10, scale = FALSE)
# "X" marks the support vectors in the plot (one apparent misclassification):
plot(svmfit, dat)

# seven of them:
svmfit$index
## [1]  1  2  5  7 14 16 17
summary(svmfit)
## 
## Call:
## svm(formula = y ~ ., data = dat, kernel = "linear", cost = 10, 
##     scale = FALSE)
## 
## 
## Parameters:
##    SVM-Type:  C-classification 
##  SVM-Kernel:  linear 
##        cost:  10 
## 
## Number of Support Vectors:  7
## 
##  ( 4 3 )
## 
## 
## Number of Classes:  2 
## 
## Levels: 
##  -1 1
# actually three misclassifications:
table(predict(svmfit),y)
##     y
##      -1 1
##   -1  8 1
##   1   2 9
plot(x, col =(3-y),pch=as.numeric(predict(svmfit)))
text(-0.9,2.3-(0:2)/5,c("Truth:",as.character(unique(y))),col=c(1,unique(3-y)),pos=2)
text(2,-1.5,"Prediction:",pos=4)
legend("bottomright",levels(predict(svmfit)),pch=1:2,bty="n")

# lower cost=0.1, wider margin, more SVs:
svmfit <- svm(y~., data=dat, kernel="linear", cost=0.1, scale=FALSE)
plot(svmfit,dat)

# 16 of them:
svmfit$index
##  [1]  1  2  3  4  5  7  9 10 12 13 14 15 16 17 18 20
summary(svmfit)
## 
## Call:
## svm(formula = y ~ ., data = dat, kernel = "linear", cost = 0.1, 
##     scale = FALSE)
## 
## 
## Parameters:
##    SVM-Type:  C-classification 
##  SVM-Kernel:  linear 
##        cost:  0.1 
## 
## Number of Support Vectors:  16
## 
##  ( 8 8 )
## 
## 
## Number of Classes:  2 
## 
## Levels: 
##  -1 1
table(predict(svmfit),y)
##     y
##      -1  1
##   -1 10  1
##   1   0  9
# now only one misclassification:
plot(x, col =(3-y),pch=as.numeric(predict(svmfit)))

# tune cost by cross-validation:
set.seed(1)
tune.out <- tune(svm, y~., data=dat, kernel="linear", ranges=list(cost=c(0.001, 0.01, 0.1, 1, 5, 10, 100)))
# cost=0.1 is the best:
summary(tune.out)
## 
## Parameter tuning of 'svm':
## 
## - sampling method: 10-fold cross validation 
## 
## - best parameters:
##  cost
##   0.1
## 
## - best performance: 0.1 
## 
## - Detailed performance results:
##    cost error dispersion
## 1 1e-03  0.70  0.4216370
## 2 1e-02  0.70  0.4216370
## 3 1e-01  0.10  0.2108185
## 4 1e+00  0.15  0.2415229
## 5 5e+00  0.15  0.2415229
## 6 1e+01  0.15  0.2415229
## 7 1e+02  0.15  0.2415229
# best model:
bestmod <- tune.out$best.model
summary(bestmod)
## 
## Call:
## best.tune(method = svm, train.x = y ~ ., data = dat, ranges = list(cost = c(0.001, 
##     0.01, 0.1, 1, 5, 10, 100)), kernel = "linear")
## 
## 
## Parameters:
##    SVM-Type:  C-classification 
##  SVM-Kernel:  linear 
##        cost:  0.1 
## 
## Number of Support Vectors:  16
## 
##  ( 8 8 )
## 
## 
## Number of Classes:  2 
## 
## Levels: 
##  -1 1
# denser grid around minimum:
tune.out.1 <- tune(svm, y~., data=dat, kernel="linear", ranges=list(cost=c(0.01, 0.02, 0.05, 0.1, 0.2, 0.5, 1)))
summary(tune.out.1)
## 
## Parameter tuning of 'svm':
## 
## - sampling method: 10-fold cross validation 
## 
## - best parameters:
##  cost
##   0.5
## 
## - best performance: 0.05 
## 
## - Detailed performance results:
##   cost error dispersion
## 1 0.01  0.35  0.4116363
## 2 0.02  0.35  0.4116363
## 3 0.05  0.25  0.3535534
## 4 0.10  0.10  0.2108185
## 5 0.20  0.20  0.2581989
## 6 0.50  0.05  0.1581139
## 7 1.00  0.15  0.2415229
# test data set (try other than 10 nTestObs, compare to cost=0.01 below):
nTestObs <- 10
#nTestObs <- 1000
xtest <- matrix (rnorm (2*nTestObs*2) , ncol =2)
ytest <- sample (c(-1,1) , 2*nTestObs, rep=TRUE)
xtest[ytest ==1,] <- xtest[ytest ==1,] + 1
testdat <- data.frame(x=xtest, y=as.factor(ytest))

ypred <-  predict(bestmod,testdat)
# about 25% error rate:
table(predict=ypred, truth=testdat$y)
##        truth
## predict -1 1
##      -1  8 3
##      1   0 9
# cost=0.01
svmfit <- svm(y~., data=dat, kernel="linear", cost=0.01, scale=FALSE)
# 20 SVs:
plot(svmfit,dat)

ypred <- predict(svmfit, testdat)
# about 28% error rate:
table(predict=ypred, truth=testdat$y)
##        truth
## predict -1 1
##      -1  8 4
##      1   0 8

With nTestObs = 1000 (i.e. 2000 test observations) the difference in test error between cost=0.01 and cost=0.1 (slightly higher for cost=0.01) becomes clearly visible, whereas with only 20 test observations the two models differ by just one or two misclassifications.

# linearly (barely) separable case:
x[y==1,] <- x[y==1,]+0.5
plot(x, col=(y+5)/2, pch=(y+5)/2+15)

dat <- data.frame(x=x,y=as.factor(y))
# high cost for perfect separation:
svmfit <- svm(y~., data=dat, kernel ="linear", cost=1e5)
plot(svmfit,dat)

summary(svmfit)
## 
## Call:
## svm(formula = y ~ ., data = dat, kernel = "linear", cost = 1e+05)
## 
## 
## Parameters:
##    SVM-Type:  C-classification 
##  SVM-Kernel:  linear 
##        cost:  1e+05 
## 
## Number of Support Vectors:  3
## 
##  ( 1 2 )
## 
## 
## Number of Classes:  2 
## 
## Levels: 
##  -1 1
# perfect:
table(predict(svmfit),y)
##     y
##      -1  1
##   -1 10  0
##   1   0 10
# test error on 20K observations:
nTestObs <- 10000
xtest <- matrix (rnorm (2*nTestObs*2) , ncol =2)
ytest <- sample (c(-1,1) , 2*nTestObs, rep=TRUE)
xtest[ytest ==1,] <- xtest[ytest ==1,] + 1 + 0.5
testdat <- data.frame(x=xtest, y=as.factor(ytest))

# about 15-18% error on test data:
ypred <-  predict(svmfit,testdat)
table(predict=ypred, truth=testdat$y)
##        truth
## predict   -1    1
##      -1 8399 1535
##      1  1712 8354
# lower cost, more SVs, one misclassification:
svmfit <- svm(y~., data=dat, kernel="linear", cost=1)
plot(svmfit,dat)

summary(svmfit)
## 
## Call:
## svm(formula = y ~ ., data = dat, kernel = "linear", cost = 1)
## 
## 
## Parameters:
##    SVM-Type:  C-classification 
##  SVM-Kernel:  linear 
##        cost:  1 
## 
## Number of Support Vectors:  7
## 
##  ( 4 3 )
## 
## 
## Number of Classes:  2 
## 
## Levels: 
##  -1 1
table(predict(svmfit),y)
##     y
##      -1  1
##   -1 10  1
##   1   0  9
# better performance on test data:
# about 13-15% error in test data:
ypred <-  predict(svmfit,testdat)
table(predict=ypred, truth=testdat$y)
##        truth
## predict   -1    1
##      -1 8569 1320
##      1  1542 8569

In line with ISLR conclusion that model fit with cost=1 “will perform better on test data than the model with cost=1e5.”

Ch.9.6.2 – Support Vector Machine

set.seed(1)
nObs <- 100
x <- matrix(rnorm(2*nObs*2),ncol=2)
x[1:nObs,] <- x[1:nObs,]+2
x[nObs+(1:(nObs/2)),] <- x[nObs+(1:(nObs/2)),]-2
y <- c(rep(1,1.5*nObs),rep(2,nObs/2))
dat <- data.frame(x=x,y=as.factor(y))
# non-linear decision boundary:
plot(x,col=y,pch=y)

train <- sample(2*nObs,nObs)
# non-linear kernel, cost=1:
svmfit <- svm(y~., data=dat[train,], kernel="radial", gamma=25, cost=1)
plot(svmfit,dat[train,])

summary(svmfit)
## 
## Call:
## svm(formula = y ~ ., data = dat[train, ], kernel = "radial", 
##     gamma = 25, cost = 1)
## 
## 
## Parameters:
##    SVM-Type:  C-classification 
##  SVM-Kernel:  radial 
##        cost:  1 
## 
## Number of Support Vectors:  89
## 
##  ( 26 63 )
## 
## 
## Number of Classes:  2 
## 
## Levels: 
##  1 2
table(predict(svmfit),dat[train,'y'])
##    
##      1  2
##   1 73  5
##   2  0 22
# cost=100K, more irregular decision boundary, fewer SVs, lower training error:
svmfit <- svm(y~.,data=dat[train,],kernel="radial",gamma=1,cost=1e5)
plot(svmfit,dat[train,])

summary(svmfit)
## 
## Call:
## svm(formula = y ~ ., data = dat[train, ], kernel = "radial", 
##     gamma = 1, cost = 1e+05)
## 
## 
## Parameters:
##    SVM-Type:  C-classification 
##  SVM-Kernel:  radial 
##        cost:  1e+05 
## 
## Number of Support Vectors:  26
## 
##  ( 12 14 )
## 
## 
## Number of Classes:  2 
## 
## Levels: 
##  1 2
table(predict(svmfit),dat[train,'y'])
##    
##      1  2
##   1 73  1
##   2  0 26
# tune by cross-validation:
set.seed(1)
tune.out=tune(svm, y~., data=dat[train,], kernel="radial",ranges=list(cost=c(0.1,1,10,100,1000),gamma=c(0.5,1,2,3,4)))
# best cost=1, gamma=2:
summary (tune.out)
## 
## Parameter tuning of 'svm':
## 
## - sampling method: 10-fold cross validation 
## 
## - best parameters:
##  cost gamma
##     1     2
## 
## - best performance: 0.12 
## 
## - Detailed performance results:
##     cost gamma error dispersion
## 1  1e-01   0.5  0.27 0.11595018
## 2  1e+00   0.5  0.13 0.08232726
## 3  1e+01   0.5  0.15 0.07071068
## 4  1e+02   0.5  0.17 0.08232726
## 5  1e+03   0.5  0.21 0.09944289
## 6  1e-01   1.0  0.25 0.13540064
## 7  1e+00   1.0  0.13 0.08232726
## 8  1e+01   1.0  0.16 0.06992059
## 9  1e+02   1.0  0.20 0.09428090
## 10 1e+03   1.0  0.20 0.08164966
## 11 1e-01   2.0  0.25 0.12692955
## 12 1e+00   2.0  0.12 0.09189366
## 13 1e+01   2.0  0.17 0.09486833
## 14 1e+02   2.0  0.19 0.09944289
## 15 1e+03   2.0  0.20 0.09428090
## 16 1e-01   3.0  0.27 0.11595018
## 17 1e+00   3.0  0.13 0.09486833
## 18 1e+01   3.0  0.18 0.10327956
## 19 1e+02   3.0  0.21 0.08755950
## 20 1e+03   3.0  0.22 0.10327956
## 21 1e-01   4.0  0.27 0.11595018
## 22 1e+00   4.0  0.15 0.10801234
## 23 1e+01   4.0  0.18 0.11352924
## 24 1e+02   4.0  0.21 0.08755950
## 25 1e+03   4.0  0.24 0.10749677
# 10/100 misclassifications:
table(true=dat[-train ,"y"],pred=predict(tune.out$best.model,newdata=dat[-train ,]))
##     pred
## true  1  2
##    1 74  3
##    2  7 16
# 13/100 misclassifications for high cost model:
table(true=dat[-train ,"y"],pred=predict(svmfit,newdata=dat[-train ,]))
##     pred
## true  1  2
##    1 72  5
##    2  8 15

Ch.9.6.3 – ROC curves

# wrap ROCR functions:
# Draw an ROC curve (true-positive rate vs false-positive rate) for a vector
# of prediction scores using ROCR; extra arguments (e.g. add=TRUE, col=...)
# are forwarded to plot().
rocplot <- function(pred, truth, ...) {
  perf <- performance(prediction(pred, truth), "tpr", "fpr")
  plot(perf, ...)
}

# best model:
svmfit.opt <- svm(y~., data=dat[train,], kernel="radial", gamma=2, cost=1, decision.values=TRUE)
plot(svmfit.opt,dat[train,])

table(predict(svmfit.opt),dat[train,"y"])
##    
##      1  2
##   1 69  5
##   2  4 22
# more flexible model (higher gamma):
svmfit.flex <- svm(y~., data=dat[train,], kernel="radial", gamma=50, cost=1, decision.values=TRUE)
plot(svmfit.flex,dat[train,])

table(predict(svmfit.flex),dat[train,"y"])
##    
##      1  2
##   1 73  3
##   2  0 24
# ROC curves for training and test predictions:
# ROC curves side by side: training data (left) and test data (right),
# comparing the tuned (gamma=2) and the more flexible (gamma=50) fits.
old.par <- par(mfrow = c(1, 2))
fitted.opt <- attributes(predict(svmfit.opt, dat[train, ], decision.values = TRUE))$decision.values
fitted.flex <- attributes(predict(svmfit.flex, dat[train, ], decision.values = TRUE))$decision.values
rocplot(fitted.opt, dat[train, "y"], main = "Training Data")
# FIX: the original passed col="red " (trailing space), which is not a valid
# R color name and is rejected by col2rgb(); "red" is what was intended.
rocplot(fitted.flex, dat[train, "y"], add = TRUE, col = "red")
legend("bottomright", c("gamma=2", "gamma=50"), col = 1:2, text.col = 1:2, lty = 1)
fitted.opt <- attributes(predict(svmfit.opt, dat[-train, ], decision.values = TRUE))$decision.values
fitted.flex <- attributes(predict(svmfit.flex, dat[-train, ], decision.values = TRUE))$decision.values
rocplot(fitted.opt, dat[-train, "y"], main = "Test Data")
rocplot(fitted.flex, dat[-train, "y"], add = TRUE, col = "red")
legend("bottomright", c("gamma=2", "gamma=50"), col = 1:2, text.col = 1:2, lty = 1)

par(old.par)

Notice how the ROC curves of the two models swap their relative positions between the training and the test data: the more flexible gamma=50 fit looks better on the training data but loses to the gamma=2 fit on the test data.

Ch.9.6.4 – SVM with multiple classes

set.seed(1)
# add points for the third class:
x <- rbind(x,matrix(rnorm(50*2),ncol=2))
y <- c(y,rep(0,50))
x[y==0,2] <- x[y==0,2]+2
dat <- data.frame(x=x,y=as.factor(y))
plot(x,col=(y+1),pch=(y+1))

svmfit <- svm(y~.,data=dat,kernel="radial",cost=10, gamma=1)
plot(svmfit,dat)

Ch.9.6.5 – gene expression data

names(Khan)
## [1] "xtrain" "xtest"  "ytrain" "ytest"
dim(Khan$xtrain)
## [1]   63 2308
dim(Khan$xtest)
## [1]   20 2308
length(Khan$ytrain)
## [1] 63
length(Khan$ytest)
## [1] 20
table(Khan$ytrain)
## 
##  1  2  3  4 
##  8 23 12 20
table(Khan$ytest)
## 
## 1 2 3 4 
## 3 6 6 5
dat <- data.frame(x=Khan$xtrain, y=as.factor(Khan$ytrain))
out <- svm(y~., data=dat, kernel="linear", cost=10)
summary(out)
## 
## Call:
## svm(formula = y ~ ., data = dat, kernel = "linear", cost = 10)
## 
## 
## Parameters:
##    SVM-Type:  C-classification 
##  SVM-Kernel:  linear 
##        cost:  10 
## 
## Number of Support Vectors:  58
## 
##  ( 20 20 11 7 )
## 
## 
## Number of Classes:  4 
## 
## Levels: 
##  1 2 3 4
table(out$fitted,dat$y)
##    
##      1  2  3  4
##   1  8  0  0  0
##   2  0 23  0  0
##   3  0  0 12  0
##   4  0  0  0 20
dat.te <- data.frame(x=Khan$xtest, y=as.factor(Khan$ytest))
pred.te <- predict(out,newdata=dat.te)
table(pred.te,dat.te$y)
##        
## pred.te 1 2 3 4
##       1 3 0 0 0
##       2 0 6 2 0
##       3 0 0 4 0
##       4 0 0 0 5
# pool training and test data together, obtained resampling-based test error estimate:
KhanAllDat <- data.frame(x=rbind(Khan$xtrain,Khan$xtest),y=as.factor(c(Khan$ytrain,Khan$ytest)))
# repeatedly draw training and test data stratified by cell type;
# Resampling-based test-error estimate: in each of nSim iterations draw a
# class-stratified bootstrap training sample, fit a linear SVM on a random
# ~1/5 subset of the genes, and predict the out-of-bag observations.
# (testPred/testTruth grow via c() inside the loop — fine at this scale.)
testPred <- NULL
testTruth <- NULL
nSim <- 100
for ( iSim in 1:nSim ) {
  trainIdx <- NULL
  # Random subset of about one fifth of the predictor columns, plus the
  # outcome column (the last column of KhanAllDat); the non-integer size
  # ncol(KhanAllDat)/5 is truncated by sample().
  varIdx <- c(sample(ncol(KhanAllDat)-1,ncol(KhanAllDat)/5),ncol(KhanAllDat))
  # Stratified bootstrap: sample each class's rows with replacement,
  # preserving the original class sizes:
  for ( iClass in levels(KhanAllDat$y)) {
    trainIdx <- c(trainIdx,sample((1:nrow(KhanAllDat))[KhanAllDat$y==iClass],sum(KhanAllDat$y==iClass),replace=TRUE))
  }
  svmTrain <- svm(y~., data=KhanAllDat[trainIdx,varIdx], kernel="linear", cost=10)
  #testPred <- c(testPred,attributes(predict(svmTrain,newdata=KhanAllDat[-trainIdx,], decision.values=TRUE))$decision.values)
  # Negative indexing with the (possibly duplicated) bootstrap indices drops
  # every sampled row, so the remaining out-of-bag rows form the test set.
  # NOTE(review): c() applied to factors yields integer codes in R < 4.1
  # (and combines factor levels in R >= 4.1); the tables below print the
  # class codes 1..4 either way — confirm under the R version in use.
  testPred <- c(testPred,predict(svmTrain,newdata=KhanAllDat[-trainIdx,varIdx]))
  testTruth <- c(testTruth,KhanAllDat[-trainIdx,"y"])
}
#rocplot(testPred,testTruth, main="Test Data")
table(pred=testPred,truth=testTruth)
##     truth
## pred    1    2    3    4
##    1  381    0    0    0
##    2    0 1036   13   18
##    3    0   12  617    3
##    4    0   21   11  822
table(pred=testPred,truth=testTruth)/nSim
##     truth
## pred     1     2     3     4
##    1  3.81  0.00  0.00  0.00
##    2  0.00 10.36  0.13  0.18
##    3  0.00  0.12  6.17  0.03
##    4  0.00  0.21  0.11  8.22
plot(cmdscale(as.dist(1-cor(t(KhanAllDat[,-ncol(KhanAllDat)]),method="spearman"))),col=as.numeric(KhanAllDat$y))

On average slightly less than one observation per resampling iteration gets misclassified (78 off-diagonal counts across 100 simulations).

Ch.9.7, Ex.4

“Generate a simulated two-class data set with 100 observations and two features in which there is a visible but non-linear separation between the two classes. Show that in this setting, a support vector machine with a polynomial kernel (with degree greater than 1) or a radial kernel will outperform a support vector classifier on the training data. Which technique performs best on the test data? Make plots and report training and test error rates in order to back up your assertions.”

set.seed(123)
n <- 100
d <- 3
xyDat <- data.frame(x=c(rnorm(n/2),rnorm(n/4,mean=-d),rnorm(n/4,mean=d)),y=c(rnorm(n/2),rnorm(n/2,mean=d)),z=factor(c(rep(0,n/2),rep(1,n/2))))
plot(xyDat[,1:2],col=as.numeric(xyDat$z),pch=as.numeric(xyDat$z))

svmLin <- svm(z~., data=xyDat, kernel="linear")
table(xyDat$z,predict(svmLin))
##    
##      0  1
##   0 46  4
##   1  0 50
plot(svmLin,xyDat)

svmRad <- svm(z~., data=xyDat, kernel="radial")
plot(svmRad,xyDat)

table(xyDat$z,predict(svmRad))
##    
##      0  1
##   0 50  0
##   1  0 50
svmPoly <- svm(z~., data=xyDat, kernel="polynomial",coef0=1)
plot(svmPoly,xyDat)

table(xyDat$z,predict(svmPoly))
##    
##      0  1
##   0 50  0
##   1  0 50
# Compare test error of the three fitted SVMs (linear, radial, polynomial)
# on 30 independently simulated test sets of 100,000 observations each.
n <- 100000
# Preallocate a list and bind once at the end instead of growing dfTmp with
# rbind() inside the loop (which copies the whole frame on every iteration).
testErrList <- vector("list", 30)
for (iTry in seq_along(testErrList)) {
  # Same generative model as the training data (mixture centers at -d, 0, d):
  xyTestDat <- data.frame(x=c(rnorm(n/2),rnorm(n/4,mean=-d),rnorm(n/4,mean=d)),y=c(rnorm(n/2),rnorm(n/2,mean=d)),z=factor(c(rep(0,n/2),rep(1,n/2))))
  tstTblLin <- table(xyTestDat$z, predict(svmLin, newdata = xyTestDat))
  tstTblRad <- table(xyTestDat$z, predict(svmRad, newdata = xyTestDat))
  tstTblPoly <- table(xyTestDat$z, predict(svmPoly, newdata = xyTestDat))
  # Misclassification rate = 1 - accuracy from each confusion table:
  testErrList[[iTry]] <- data.frame(
    model = c("lin", "rad", "poly"),
    testerr = 1 - c(sum(diag(tstTblLin)), sum(diag(tstTblRad)), sum(diag(tstTblPoly))) / n
  )
}
dfTmp <- do.call(rbind, testErrList)
ggplot(dfTmp,aes(x=model,y=log(testerr),colour=model))+geom_boxplot(fill="white",outlier.colour = NA)+geom_jitter()+theme_bw()

The linear kernel yields a much higher test error than the polynomial or radial kernel; for the settings considered here, the polynomial kernel might be marginally better than the radial one.

Ch.9.7, Ex.5

“We have seen that we can fit an SVM with a non-linear kernel in order to perform classification using a non-linear decision boundary.We will now see that we can also obtain a non-linear decision boundary by performing logistic regression using non-linear transformations of the features.”

Ch.9.7, Ex.5(a)

“Generate a data set with n = 500 and p = 2, such that the observations belong to two classes with a quadratic decision boundary between them. For instance, you can do this as follows:”

# ISLR's simulated data with a quadratic decision boundary:
# class y = 1 exactly where x1^2 > x2^2.
# (Idiom fix: use <- for assignment, not =.)
x1 <- runif(500) - 0.5
x2 <- runif(500) - 0.5
y <- 1 * (x1^2 - x2^2 > 0)

Ch.9.7, Ex.5(b)

“Plot the observations, colored according to their class labels. Your plot should display X1 on the x-axis, and X2 on the yaxis.”

plot(x1,x2,col=y+1,pch=y+1)

Ch.9.7, Ex.5(c)

“Fit a logistic regression model to the data, using X1 and X2 as predictors.”

glmRes <- glm(Y~X1+X2,data.frame(Y=factor(y),X1=x1,X2=x2),family=binomial)

Ch.9.7, Ex.5(d)

“Apply this model to the training data in order to obtain a predicted class label for each training observation. Plot the observations, colored according to the predicted class labels. The decision boundary should be linear.”

glmPred <- predict(glmRes,type="response")>0.5
table(y,glmPred)
##    glmPred
## y   FALSE TRUE
##   0   148  108
##   1   166   78
old.par <- par(mfrow=c(1,2))
plot(x1,x2,col=y+1,pch=y+1)
plot(x1,x2,col=1+glmPred,pch=glmPred+1)

par(old.par)

Ch.9.7, Ex.5(e)

“Now fit a logistic regression model to the data using non-linear functions of \(X_1\) and \(X_2\) as predictors (e.g. \(X_1^2\), \(X_1 \times X_2\), \(\log{X_2}\), and so forth).”

# I(X2^2)+
glmResNL <- glm(Y~X1+X2+I(X1^2)+X1:X2,data.frame(Y=factor(y),X1=x1,X2=x2),family=binomial)

Ch.9.7, Ex.5(f)

“Apply this model to the training data in order to obtain a predicted class label for each training observation. Plot the observations, colored according to the predicted class labels. The decision boundary should be obviously non-linear. If it is not, then repeat (a)-(e) until you come up with an example in which the predicted class labels are obviously non-linear.”

glmPredNL <- predict(glmResNL,type="response")>0.5
table(y,glmPredNL)
##    glmPredNL
## y   FALSE TRUE
##   0   206   50
##   1    78  166
old.par <- par(mfrow=c(1,2))
plot(x1,x2,col=y+1,pch=y+1)
plot(x1,x2,col=1+glmPredNL,pch=glmPredNL+1)

par(old.par)

Ch.9.7, Ex.5(g)

“Fit a support vector classifier to the data with X1 and X2 as predictors. Obtain a class prediction for each training observation. Plot the observations, colored according to the predicted class labels.”

svmLin <- svm(Y~., data=data.frame(Y=factor(y),X1=x1,X2=x2),cost=1e3, kernel="linear")
plot(svmLin,data.frame(Y=factor(y),X1=x1,X2=x2))

table(y,predict(svmLin))
##    
## y     0   1
##   0 137 119
##   1 114 130

Ch.9.7, Ex.5(h)

“Fit a SVM using a non-linear kernel to the data. Obtain a class prediction for each training observation. Plot the observations, colored according to the predicted class labels.”

svmRad <- svm(Y~., data=data.frame(Y=factor(y),X1=x1,X2=x2), kernel="radial")
plot(svmRad,data.frame(Y=factor(y),X1=x1,X2=x2))

table(y,predict(svmRad))
##    
## y     0   1
##   0 253   3
##   1  24 220

Ch.9.7, Ex.7

“In this problem, you will use support vector approaches in order to predict whether a given car gets high or low gas mileage based on the Auto data set.”

class(Auto)
## [1] "data.frame"
dim(Auto)
## [1] 392   9
summary(Auto)
##       mpg          cylinders      displacement     horsepower   
##  Min.   : 9.00   Min.   :3.000   Min.   : 68.0   Min.   : 46.0  
##  1st Qu.:17.00   1st Qu.:4.000   1st Qu.:105.0   1st Qu.: 75.0  
##  Median :22.75   Median :4.000   Median :151.0   Median : 93.5  
##  Mean   :23.45   Mean   :5.472   Mean   :194.4   Mean   :104.5  
##  3rd Qu.:29.00   3rd Qu.:8.000   3rd Qu.:275.8   3rd Qu.:126.0  
##  Max.   :46.60   Max.   :8.000   Max.   :455.0   Max.   :230.0  
##                                                                 
##      weight      acceleration        year           origin     
##  Min.   :1613   Min.   : 8.00   Min.   :70.00   Min.   :1.000  
##  1st Qu.:2225   1st Qu.:13.78   1st Qu.:73.00   1st Qu.:1.000  
##  Median :2804   Median :15.50   Median :76.00   Median :1.000  
##  Mean   :2978   Mean   :15.54   Mean   :75.98   Mean   :1.577  
##  3rd Qu.:3615   3rd Qu.:17.02   3rd Qu.:79.00   3rd Qu.:2.000  
##  Max.   :5140   Max.   :24.80   Max.   :82.00   Max.   :3.000  
##                                                                
##                  name    
##  amc matador       :  5  
##  ford pinto        :  5  
##  toyota corolla    :  5  
##  amc gremlin       :  4  
##  amc hornet        :  4  
##  chevrolet chevette:  4  
##  (Other)           :365

Ch.9.7, Ex.7 a)

“Create a binary variable that takes on a 1 for cars with gas mileage above the median, and a 0 for cars with gas mileage below the median.”

Auto$mpgCtg <- factor(Auto$mpg>median(Auto$mpg))

Ch.9.7, Ex.7 b)

“Fit a support vector classifier to the data with various values of cost, in order to predict whether a car gets high or low gas mileage. Report the cross-validation errors associated with different values of this parameter. Comment on your results.”

# Repeat the cross-validated tuning a few times to gauge the variability of
# the selected cost and of the estimated error (tune() re-randomizes folds).
# (Idiom fix: use <- for assignment, not =.)
for ( iTry in 1:3 ) {
  tune.out <- tune(svm, mpgCtg~., data=Auto[,c("cylinders","displacement","horsepower","weight","acceleration","year","mpgCtg")], kernel="linear",ranges=list(cost=c(0.1,1,10,100,1000)))
  print(tune.out$best.parameters)
  print(tune.out$best.performance)
}
##   cost
## 1  0.1
## [1] 0.09173077
##   cost
## 2    1
## [1] 0.08929487
##   cost
## 3   10
## [1] 0.09173077
print(summary(tune.out))
## 
## Parameter tuning of 'svm':
## 
## - sampling method: 10-fold cross validation 
## 
## - best parameters:
##  cost
##    10
## 
## - best performance: 0.09173077 
## 
## - Detailed performance results:
##    cost      error dispersion
## 1 1e-01 0.09673077 0.03468976
## 2 1e+00 0.09929487 0.04172136
## 3 1e+01 0.09173077 0.03821436
## 4 1e+02 0.09429487 0.03984736
## 5 1e+03 0.09429487 0.03984736

Across the range of cost values tested, the lowest cross-validation error is essentially the same once its dispersion is taken into account — the location of the actual minimum varies between repeated runs of cross-validation.

Ch.9.7, Ex.7 c)

“Now repeat (b), this time using SVMs with radial and polynomial basis kernels, with different values of gamma and degree and cost. Comment on your results.”

radial

for ( iTry in 1:3 ) {
  tune.out=tune(svm, mpgCtg~., data=Auto[,c("cylinders","displacement","horsepower","weight","acceleration","year","mpgCtg")], kernel="radial",ranges=list(cost=c(0.1,1,10,100,1000),gamma=c(0.5,1,2,3,4)))
  print(tune.out$best.parameters)
  print(tune.out$best.performance)
}
##    cost gamma
## 12    1     2
## [1] 0.05621795
##    cost gamma
## 12    1     2
## [1] 0.06134615
##    cost gamma
## 12    1     2
## [1] 0.06121795
summary(tune.out)
## 
## Parameter tuning of 'svm':
## 
## - sampling method: 10-fold cross validation 
## 
## - best parameters:
##  cost gamma
##     1     2
## 
## - best performance: 0.06121795 
## 
## - Detailed performance results:
##     cost gamma      error dispersion
## 1  1e-01   0.5 0.09173077 0.03802512
## 2  1e+00   0.5 0.08160256 0.02631946
## 3  1e+01   0.5 0.06371795 0.03227585
## 4  1e+02   0.5 0.08666667 0.03633961
## 5  1e+03   0.5 0.10948718 0.05473907
## 6  1e-01   1.0 0.09429487 0.04645208
## 7  1e+00   1.0 0.06628205 0.02741579
## 8  1e+01   1.0 0.06384615 0.03472364
## 9  1e+02   1.0 0.07153846 0.03382850
## 10 1e+03   1.0 0.07391026 0.03490442
## 11 1e-01   2.0 0.11474359 0.06739480
## 12 1e+00   2.0 0.06121795 0.03837508
## 13 1e+01   2.0 0.06897436 0.04534367
## 14 1e+02   2.0 0.06641026 0.04232625
## 15 1e+03   2.0 0.06641026 0.04232625
## 16 1e-01   3.0 0.16589744 0.09549575
## 17 1e+00   3.0 0.06621795 0.03997503
## 18 1e+01   3.0 0.08166667 0.04149504
## 19 1e+02   3.0 0.08160256 0.03573688
## 20 1e+03   3.0 0.08160256 0.03573688
## 21 1e-01   4.0 0.30589744 0.13656431
## 22 1e+00   4.0 0.06878205 0.02912269
## 23 1e+01   4.0 0.07653846 0.04350608
## 24 1e+02   4.0 0.07647436 0.03804385
## 25 1e+03   4.0 0.07647436 0.03804385
for ( iTry in 1:3 ) {
  tune.out=tune(svm, mpgCtg~., data=Auto[,c("cylinders","displacement","horsepower","weight","acceleration","year","mpgCtg")], kernel="radial",ranges=list(cost=c(0.1,0.2,0.5,1,2,5,10),gamma=c(1,1.5,2,2.5,3)))
  print(tune.out$best.parameters)
  print(tune.out$best.performance)
}
##    cost gamma
## 11    1   1.5
## [1] 0.05096154
##    cost gamma
## 18    1     2
## [1] 0.06358974
##    cost gamma
## 19    2     2
## [1] 0.05339744
summary(tune.out)
## 
## Parameter tuning of 'svm':
## 
## - sampling method: 10-fold cross validation 
## 
## - best parameters:
##  cost gamma
##     2     2
## 
## - best performance: 0.05339744 
## 
## - Detailed performance results:
##    cost gamma      error dispersion
## 1   0.1   1.0 0.10711538 0.04139838
## 2   0.2   1.0 0.08423077 0.04842486
## 3   0.5   1.0 0.07391026 0.04561258
## 4   1.0   1.0 0.06621795 0.04512550
## 5   2.0   1.0 0.06365385 0.04181930
## 6   5.0   1.0 0.06365385 0.03816607
## 7  10.0   1.0 0.06365385 0.03816607
## 8   0.1   1.5 0.11217949 0.04524771
## 9   0.2   1.5 0.09429487 0.05387705
## 10  0.5   1.5 0.07134615 0.04302684
## 11  1.0   1.5 0.06108974 0.04341768
## 12  2.0   1.5 0.05602564 0.04907948
## 13  5.0   1.5 0.06108974 0.04820169
## 14 10.0   1.5 0.07128205 0.04251310
## 15  0.1   2.0 0.12500000 0.04052400
## 16  0.2   2.0 0.09685897 0.04297417
## 17  0.5   2.0 0.07384615 0.04718382
## 18  1.0   2.0 0.06102564 0.03791371
## 19  2.0   2.0 0.05339744 0.04204578
## 20  5.0   2.0 0.05852564 0.04140676
## 21 10.0   2.0 0.06358974 0.04483800
## 22  0.1   2.5 0.13506410 0.02315208
## 23  0.2   2.5 0.10717949 0.03151662
## 24  0.5   2.5 0.07134615 0.04469241
## 25  1.0   2.5 0.06365385 0.03213671
## 26  2.0   2.5 0.05346154 0.03476043
## 27  5.0   2.5 0.06871795 0.03962831
## 28 10.0   2.5 0.06615385 0.03986724
## 29  0.1   3.0 0.15275641 0.04163109
## 30  0.2   3.0 0.11730769 0.02115493
## 31  0.5   3.0 0.08153846 0.03554569
## 32  1.0   3.0 0.05858974 0.02675671
## 33  2.0   3.0 0.06115385 0.03639285
## 34  5.0   3.0 0.07384615 0.04219942
## 35 10.0   3.0 0.07384615 0.04052039

Roughly, the best performance by cross-validation is achieved for low single-digit values of cost and gamma. The average performance of the radial kernel, as estimated by cross-validation, appears better than that of the linear kernel / support vector classifier.

polynomial

for ( iTry in 1:3 ) {
  tune.out=tune(svm, mpgCtg~., data=Auto[,c("cylinders","displacement","horsepower","weight","acceleration","year","mpgCtg")], kernel="polynomial",ranges=list(cost=c(0.1,1,10,100,1000),degree=c(2,3),coef0=c(0,0.5,1,2,3,4)))
  print(tune.out$best.parameters)
  print(tune.out$best.performance)
}
##    cost degree coef0
## 39  100      3     2
## [1] 0.06634615
##    cost degree coef0
## 23   10      2     1
## [1] 0.06391026
##    cost degree coef0
## 29  100      3     1
## [1] 0.07384615
summary(tune.out)
## 
## Parameter tuning of 'svm':
## 
## - sampling method: 10-fold cross validation 
## 
## - best parameters:
##  cost degree coef0
##   100      3     1
## 
## - best performance: 0.07384615 
## 
## - Detailed performance results:
##     cost degree coef0      error dispersion
## 1  1e-01      2   0.0 0.29288462 0.08879147
## 2  1e+00      2   0.0 0.24730769 0.07772246
## 3  1e+01      2   0.0 0.24211538 0.07838593
## 4  1e+02      2   0.0 0.25762821 0.05956324
## 5  1e+03      2   0.0 0.23198718 0.08695949
## 6  1e-01      3   0.0 0.20416667 0.08899015
## 7  1e+00      3   0.0 0.11737179 0.03014631
## 8  1e+01      3   0.0 0.09166667 0.03974704
## 9  1e+02      3   0.0 0.08141026 0.03279758
## 10 1e+03      3   0.0 0.08897436 0.04435992
## 11 1e-01      2   0.5 0.09711538 0.04350298
## 12 1e+00      2   0.5 0.08429487 0.04381671
## 13 1e+01      2   0.5 0.08929487 0.03470503
## 14 1e+02      2   0.5 0.09698718 0.03385798
## 15 1e+03      2   0.5 0.08935897 0.04068502
## 16 1e-01      3   0.5 0.08942308 0.04075011
## 17 1e+00      3   0.5 0.08679487 0.03679088
## 18 1e+01      3   0.5 0.08410256 0.03797147
## 19 1e+02      3   0.5 0.07903846 0.04420346
## 20 1e+03      3   0.5 0.08153846 0.02592049
## 21 1e-01      2   1.0 0.09711538 0.04350298
## 22 1e+00      2   1.0 0.09192308 0.04727122
## 23 1e+01      2   1.0 0.09185897 0.03668505
## 24 1e+02      2   1.0 0.09698718 0.03385798
## 25 1e+03      2   1.0 0.08935897 0.04068502
## 26 1e-01      3   1.0 0.09205128 0.04881925
## 27 1e+00      3   1.0 0.08160256 0.03573688
## 28 1e+01      3   1.0 0.08666667 0.03427047
## 29 1e+02      3   1.0 0.07384615 0.03858219
## 30 1e+03      3   1.0 0.09160256 0.03760738
## 31 1e-01      2   2.0 0.09461538 0.04708250
## 32 1e+00      2   2.0 0.08679487 0.04565175
## 33 1e+01      2   2.0 0.09185897 0.03668505
## 34 1e+02      2   2.0 0.09698718 0.03385798
## 35 1e+03      2   2.0 0.08935897 0.04068502
## 36 1e-01      3   2.0 0.08942308 0.04889872
## 37 1e+00      3   2.0 0.08416667 0.03420679
## 38 1e+01      3   2.0 0.07647436 0.03804385
## 39 1e+02      3   2.0 0.07897436 0.03249803
## 40 1e+03      3   2.0 0.09423077 0.03137462
## 41 1e-01      2   3.0 0.08948718 0.04887906
## 42 1e+00      2   3.0 0.08679487 0.04565175
## 43 1e+01      2   3.0 0.09185897 0.03668505
## 44 1e+02      2   3.0 0.09698718 0.03385798
## 45 1e+03      2   3.0 0.08935897 0.04068502
## 46 1e-01      3   3.0 0.08685897 0.05171227
## 47 1e+00      3   3.0 0.08160256 0.03363067
## 48 1e+01      3   3.0 0.07647436 0.03804385
## 49 1e+02      3   3.0 0.07897436 0.03249803
## 50 1e+03      3   3.0 0.09166667 0.02926648
## 51 1e-01      2   4.0 0.09205128 0.04881925
## 52 1e+00      2   4.0 0.08679487 0.04565175
## 53 1e+01      2   4.0 0.09185897 0.03668505
## 54 1e+02      2   4.0 0.09698718 0.03385798
## 55 1e+03      2   4.0 0.08935897 0.04068502
## 56 1e-01      3   4.0 0.08423077 0.04204485
## 57 1e+00      3   4.0 0.08416667 0.03200003
## 58 1e+01      3   4.0 0.07903846 0.03700723
## 59 1e+02      3   4.0 0.07891026 0.03847631
## 60 1e+03      3   4.0 0.09166667 0.02665379
for ( iTry in 1:10 ) {
  tune.out=tune(svm, mpgCtg~., data=Auto[,c("cylinders","displacement","horsepower","weight","acceleration","year","mpgCtg")], kernel="polynomial",ranges=list(cost=c(5,10,20,50,100,200),degree=c(2,3),coef0=c(0,0.5,1,1.5,2)))
  print(tune.out$best.parameters)
  print(tune.out$best.performance)
}
##   cost degree coef0
## 9   20      3     0
## [1] 0.07102564
##    cost degree coef0
## 58   50      3     2
## [1] 0.05884615
##    cost degree coef0
## 36  200      3     1
## [1] 0.06358974
##    cost degree coef0
## 46   50      3   1.5
## [1] 0.06634615
##    cost degree coef0
## 58   50      3     2
## [1] 0.06634615
##    cost degree coef0
## 23  100      3   0.5
## [1] 0.06891026
##    cost degree coef0
## 23  100      3   0.5
## [1] 0.06897436
##    cost degree coef0
## 26   10      2     1
## [1] 0.07384615
##    cost degree coef0
## 11  100      3     0
## [1] 0.07403846
##    cost degree coef0
## 58   50      3     2
## [1] 0.06910256
summary(tune.out)
## 
## Parameter tuning of 'svm':
## 
## - sampling method: 10-fold cross validation 
## 
## - best parameters:
##  cost degree coef0
##    50      3     2
## 
## - best performance: 0.06910256 
## 
## - Detailed performance results:
##    cost degree coef0      error dispersion
## 1     5      2   0.0 0.24782051 0.06022094
## 2    10      2   0.0 0.23487179 0.04699223
## 3    20      2   0.0 0.24506410 0.05041291
## 4    50      2   0.0 0.24256410 0.05851752
## 5   100      2   0.0 0.23750000 0.06180886
## 6   200      2   0.0 0.25269231 0.05598274
## 7     5      3   0.0 0.10217949 0.04828417
## 8    10      3   0.0 0.08679487 0.04993531
## 9    20      3   0.0 0.08173077 0.03968525
## 10   50      3   0.0 0.07147436 0.04312562
## 11  100      3   0.0 0.07653846 0.04179325
## 12  200      3   0.0 0.08166667 0.04321969
## 13    5      2   0.5 0.07679487 0.04374950
## 14   10      2   0.5 0.08185897 0.04172530
## 15   20      2   0.5 0.07923077 0.03728029
## 16   50      2   0.5 0.08692308 0.03877861
## 17  100      2   0.5 0.08692308 0.03684668
## 18  200      2   0.5 0.08685897 0.03462231
## 19    5      3   0.5 0.08429487 0.03830363
## 20   10      3   0.5 0.08942308 0.03873036
## 21   20      3   0.5 0.08179487 0.04638199
## 22   50      3   0.5 0.07923077 0.04250279
## 23  100      3   0.5 0.07666667 0.04507200
## 24  200      3   0.5 0.08423077 0.04341678
## 25    5      2   1.0 0.07679487 0.04374950
## 26   10      2   1.0 0.07929487 0.04109464
## 27   20      2   1.0 0.07923077 0.04443232
## 28   50      2   1.0 0.08179487 0.04502031
## 29  100      2   1.0 0.08692308 0.03684668
## 30  200      2   1.0 0.08685897 0.03462231
## 31    5      3   1.0 0.09198718 0.03865957
## 32   10      3   1.0 0.09198718 0.03865957
## 33   20      3   1.0 0.08179487 0.03600437
## 34   50      3   1.0 0.07160256 0.04339033
## 35  100      3   1.0 0.07923077 0.04101251
## 36  200      3   1.0 0.08423077 0.03991005
## 37    5      2   1.5 0.07423077 0.04448634
## 38   10      2   1.5 0.07160256 0.03988126
## 39   20      2   1.5 0.07923077 0.04443232
## 40   50      2   1.5 0.07923077 0.04101251
## 41  100      2   1.5 0.08692308 0.03684668
## 42  200      2   1.5 0.08685897 0.03462231
## 43    5      3   1.5 0.08429487 0.04365176
## 44   10      3   1.5 0.08942308 0.04057270
## 45   20      3   1.5 0.08179487 0.03391477
## 46   50      3   1.5 0.07160256 0.04339033
## 47  100      3   1.5 0.08179487 0.03985624
## 48  200      3   1.5 0.08679487 0.04369352
## 49    5      2   2.0 0.07423077 0.04448634
## 50   10      2   2.0 0.07416667 0.04448352
## 51   20      2   2.0 0.07923077 0.04443232
## 52   50      2   2.0 0.07923077 0.04101251
## 53  100      2   2.0 0.08692308 0.03684668
## 54  200      2   2.0 0.08685897 0.03462231
## 55    5      3   2.0 0.09198718 0.04396442
## 56   10      3   2.0 0.09198718 0.03865957
## 57   20      3   2.0 0.08948718 0.03277558
## 58   50      3   2.0 0.06910256 0.04544486
## 59  100      3   2.0 0.08179487 0.03985624
## 60  200      3   2.0 0.09192308 0.04849552

There is more variability in the choice of best parameters for the polynomial kernel than for the radial kernel; the best model performance can also be slightly worse than that of the radial kernel.

Ch.9.7, Ex.7 d)

“Make some plots to back up your assertions in (b) and (c).”

# Linear-kernel SVC on the six quantitative predictors; visualize the fitted
# classifier on three different pairs of variables (2-D slices of the
# decision boundary, holding the remaining predictors at their medians).
quantVars <- c("cylinders", "displacement", "horsepower", "weight",
               "acceleration", "year", "mpgCtg")
svmfit <- svm(mpgCtg ~ ., data = Auto[, quantVars],
              kernel = "linear", cost = 10)
plot(svmfit, Auto, year ~ weight)

plot(svmfit, Auto, acceleration ~ cylinders)

plot(svmfit, Auto, horsepower ~ displacement)

# Same six predictors, radial kernel with cost = 2 and gamma = 2 — a more
# flexible decision boundary than the linear fit; shown on the same three
# variable pairs for comparison.
quantVars <- c("cylinders", "displacement", "horsepower", "weight",
               "acceleration", "year", "mpgCtg")
svmfit <- svm(mpgCtg ~ ., data = Auto[, quantVars],
              kernel = "radial", cost = 2, gamma = 2)
plot(svmfit, Auto, year ~ weight)

plot(svmfit, Auto, acceleration ~ cylinders)

plot(svmfit, Auto, horsepower ~ displacement)

# Radial kernel restricted to just weight and year — first with moderate
# settings (cost = 2, gamma = 2), then with much larger ones
# (cost = 20, gamma = 20) to see how the boundary shape changes.
twoVars <- c("weight", "year", "mpgCtg")
svmfit <- svm(mpgCtg ~ ., data = Auto[, twoVars],
              kernel = "radial", cost = 2, gamma = 2)
plot(svmfit, Auto, year ~ weight)

svmfit <- svm(mpgCtg ~ ., data = Auto[, twoVars],
              kernel = "radial", cost = 20, gamma = 20)
plot(svmfit, Auto, year ~ weight)

# Back to all six predictors with the same large cost/gamma settings:
quantVars <- c("cylinders", "displacement", "horsepower", "weight",
               "acceleration", "year", "mpgCtg")
svmfit <- svm(mpgCtg ~ ., data = Auto[, quantVars],
              kernel = "radial", cost = 20, gamma = 20)
plot(svmfit, Auto, year ~ weight)

# Logistic regression on the same six predictors, for comparison with the
# SVM fits; per the output below, weight and year are the clearly
# significant coefficients. (Kept as a single expression: summary.glm
# prints the deparsed call, so restructuring would change the rendered
# output.)
summary(glm(mpgCtg~.,data=Auto[,c("cylinders","displacement","horsepower","weight","acceleration","year","mpgCtg")],family=binomial))
## 
## Call:
## glm(formula = mpgCtg ~ ., family = binomial, data = Auto[, c("cylinders", 
##     "displacement", "horsepower", "weight", "acceleration", "year", 
##     "mpgCtg")])
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.1999  -0.1126   0.0115   0.2249   3.3019  
## 
## Coefficients:
##                Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  -15.828787   5.653426  -2.800  0.00511 ** 
## cylinders     -0.015009   0.405220  -0.037  0.97045    
## displacement  -0.006745   0.009961  -0.677  0.49831    
## horsepower    -0.035608   0.023543  -1.512  0.13042    
## weight        -0.003986   0.001085  -3.673  0.00024 ***
## acceleration   0.007983   0.141357   0.056  0.95497    
## year           0.414204   0.072700   5.697 1.22e-08 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 543.43  on 391  degrees of freedom
## Residual deviance: 159.34  on 385  degrees of freedom
## AIC: 173.34
## 
## Number of Fisher Scoring iterations: 8

The time it took to knit this file from beginning to end is about (seconds):

# Total knit time: current proc.time() minus ptStart, which is presumably
# recorded at the start of the document (not visible in this chunk).
proc.time() - ptStart
##    user  system elapsed 
## 169.656   3.893 176.505